This document is a lab-book for visual data minining carried out on the CHUSJ dataset.
# Read dataset from SPSS file
BD_chusj <- haven::read_sav(file.path("data", "BD_chusj_MAIN.sav"))
BD_chusj <- BD_chusj %>% zap_formats %>% zap_label %>% zap_widths
hist(BD_chusj$Age, col = "azure3",
main = "Histogram of Age",
xlab = "Age" )
# Add boxplot
par(new = TRUE)
boxplot(BD_chusj$Age,
horizontal = TRUE,
axes = FALSE,
boxwex = 0.15,
col = rgb(0.81, 0.85, 0.9, alpha = 0.5))
FirstDate <-
BD_chusj %>%
select(DateOfFirstPositiveLabResult) %>%
group_by(date = DateOfFirstPositiveLabResult) %>%
summarise(freq = n())
FirstDate_daily <- xts::as.xts(FirstDate$freq, order.by = as.Date(FirstDate$date))
FirstDate_weekly <- xts::apply.weekly(FirstDate_daily, sum)
FirstDate_monthly <- xts::apply.monthly(FirstDate_daily, sum)
par(mfrow = c(3,1))
plot(FirstDate_daily, col = "cadetblue4", lwd = 3,
main = "Date of first positive lab result - daily")
plot(FirstDate_weekly, col = "cadetblue4", lwd = 3,
main = "Date of first positive lab result - weekly")
plot(FirstDate_monthly, col = "cadetblue4", lwd = 3,
main = "Date of first positive lab result - monhly")
par(mfrow = c(1,1))
# Clear aux variables
rm(list = ls()[grep("^FirstDate", ls())])
comorbidities <- BD_chusj[,c(7:18, 20:21, 23)]
comorbidities <- sapply(comorbidities, sum) %>% as.data.frame()
comorbidities <- data.frame(comorbidities = rownames(comorbidities),
count = comorbidities[,1])
ggplot(data = comorbidities, aes(x = reorder(comorbidities, count), y = count)) +
geom_bar(stat = 'identity', color = "cadetblue4", fill = "azure3", width = 0.85) +
geom_text(aes(label = count), hjust = -0.3, color = "slategray4", size = 3.5) +
coord_flip() +
ggtitle(label = "Comorbidities",
subtitle = "Frequency in 2688 patients") +
labs(x = "", y = "")
rm(comorbidities)
comorbidities <- BD_chusj[,c(7:18, 20:21, 23)]
comorbidities <- sapply(comorbidities, sum) %>% t() %>% as.data.frame()
comorbidities <- comorbidities / nrow(BD_chusj) # convert to percentage
# Find max and round it (ceiling) to the nearest .1 multiple
max.axis <- plyr::round_any(max(comorbidities), 0.1, f = ceiling)
# Prepare data for radar chart
comorbidities <- rbind(max = rep(max.axis, 15),
min = rep(0, 15),
value = comorbidities)
radarchart(comorbidities,
axistype = 1,
# custom polygon
pcol = rgb(0.2, 0.5, 0.5, 0.9), # line color
pfcol = rgb(0.2, 0.5, 0.5, 0.2), # fill-in color
plwd = 2, # line width
# custom the grid
cglcol = "grey20", # net color
cglty = 3, # net line type
axislabcol = "grey50", # net label color
cglwd = 0.8, # net width
caxislabels = scales::percent(seq(0, max.axis, length.out = 5)),
# custom labels
calcex = 0.9, # font size of net labels
vlcex = 0.8, # font size of outer labels
title = "Overall frequency of comorbidities (%)")
#' ATCs and Comorbidities
ATC_lv1 <- BD_chusj[, c(469, 378:390)]
comorbidities <- BD_chusj[,c(7:18, 20:21, 23)]
ATCs_comorbidities <- cbind(ATC_lv1, comorbidities)
# Create an empty dataset
ATC.Comorbs_matrix <- data.frame(matrix(NA,
ncol = ncol(comorbidities),
nrow = ncol(ATC_lv1)))
colnames(ATC.Comorbs_matrix) <- names(comorbidities)
rownames(ATC.Comorbs_matrix) <- names(ATC_lv1)
# Compute matrix
for (atc in rownames(ATC.Comorbs_matrix)) {
for (comorb in colnames(ATC.Comorbs_matrix)) {
ATC.Comorbs_matrix[atc, comorb] <-
ATCs_comorbidities[ATCs_comorbidities[,atc] == 1 & ATCs_comorbidities[,comorb] == 1,
c(atc, comorb)] %>% nrow()
}
}
rm(atc, comorb, ATCs_comorbidities, comorbidities)
# Check Top-N ATC families
ATCs <- sapply(ATC_lv1, sum) %>% as.data.frame() %>% arrange(desc(.))
# Select Top-n ATC families
ATCs <- ATCs %>% head(4)
# Prepare data for radar chart
radar.data <- ATC.Comorbs_matrix[rownames(ATCs), ]
# Find max and round it (ceiling) to the nearest .1 multiple
max.axis <- plyr::round_any(max(radar.data), 50, f = ceiling)
radar.data <- rbind(max = rep(max.axis, ncol(radar.data)),
min = rep(0, ncol(radar.data)),
radar.data)
# Define colors
colors <- RColorBrewer::brewer.pal(nrow(radar.data) - 2, "Set1")
colors_border <- colors
colors_in <- scales::alpha(colors, alpha = 0.15)
radarchart(radar.data,
axistype = 1,
# custom polygon
pcol = colors_border, # line color
pfcol = colors_in, # fill-in color
plwd = 2, # line width
plty = 1, # line type
# custom the grid
cglcol = "grey20", # net color
cglty = 3, # net line type
axislabcol = "grey50", # net label color
cglwd = 0.8, # net width
caxislabels = seq(0, max.axis, length.out = 5),
# custom labels
calcex = 0.8,
vlcex = 0.7,
title = "Frequency of top-4 ATC groups per Comorbidity (n)")
legend(x = 1.1,
y = -0.7,
legend = rownames(radar.data[-c(1,2),]),
bty = "n",
pch = 20,
col = colors_border,
text.col = "grey40",
cex = 0.9,
pt.cex = 2)
comorbidities <- BD_chusj[,c(7:18, 20:21, 23)]
comorbidities <- cbind(Sex = BD_chusj$Gender,
comorbidities)
comorbidities <- rbind(
# total = comorbidities %>% sapply(., sum) %>% t() %>% as.data.frame(),
male = comorbidities %>% filter(Sex == 1) %>% sapply(., sum) %>% t() %>% as.data.frame(),
female = comorbidities %>% filter(Sex == 0) %>% sapply(., sum) %>% t() %>% as.data.frame())
comorbidities$Sex <- NULL
# convert to percentage
comorbidities <- comorbidities / nrow(BD_chusj)
# Find max and round it (ceiling) to the nearest .1 multiple
max.axis <- plyr::round_any(max(comorbidities), 0.1, f = ceiling)
# Prepare data for radar chart
comorbidities <- rbind(max = rep(max.axis, length(comorbidities)),
min = rep(0, length(comorbidities)),
comorbidities)
# Define colors
# colors <- RColorBrewer::brewer.pal(3, "BuPu")
colors <- c("dodgerblue3", "hotpink1")
colors_border <- colors
colors_in <- c(scales::alpha(colors[1], 0.15), scales::alpha(colors[2], 0.15))
radarchart(comorbidities,
axistype = 1,
# custom polygon
pcol = colors_border, # line color
pfcol = colors_in, # fill-in color
plwd = 2, # line width
plty = 1, # line type
# custom the grid
cglcol = "grey20", # net color
cglty = 3, # net line type
axislabcol = "grey50", # net label color
cglwd = 0.8, # net width
caxislabels = scales::percent(seq(0, max.axis, length.out = 5)),
# custom labels
calcex = 0.8,
vlcex = 0.7,
title = "Frequency of Comorbidities per Sex (%)")
legend(x = 1.1,
y = -1,
legend = rownames(comorbidities[-c(1,2),]),
bty = "n",
pch = 20,
col = colors_border,
text.col = "grey40",
cex = 0.9,
pt.cex = 2)
{
cat("Frequencies in percentage (%):\n")
(comorbidities[-c(1:2),] * 100) %>% round(., digits = 1) %>%
t() %>% as.data.frame() %>% print()
}
## Frequencies in percentage (%):
## male female
## CANC 17.2 14.0
## CEREBROVASCULAR 6.0 4.6
## DIAB 16.9 12.4
## KIDNEY 9.3 6.3
## LIVER 5.1 2.1
## LUNG_I 7.1 1.5
## LUNG_II 5.8 3.9
## HEART 11.7 7.6
## TRANSP 1.2 0.5
## OBESITY 8.9 10.8
## SMOKING 15.6 1.6
## NERVOUS 14.4 12.8
## ASTHMA 1.7 2.5
## HYPERTENSION 1.7 2.5
## PreconditionOther 8.3 4.4
comorbidities <- BD_chusj[,c(7:18, 20:21, 23)]
comorbidities <- cbind(IntensiveCare = BD_chusj$IntensiveCare,
comorbidities)
comorbidities <- rbind(
# total = comorbidities %>% sapply(., sum) %>% t() %>% as.data.frame(),
IC_no = comorbidities %>% filter(IntensiveCare == 0) %>% sapply(., sum) %>% t() %>% as.data.frame(),
IC_yes = comorbidities %>% filter(IntensiveCare == 1) %>% sapply(., sum) %>% t() %>% as.data.frame())
comorbidities$IntensiveCare <- NULL
# convert to percentage
comorbidities <- comorbidities / nrow(BD_chusj)
# comorbidities %>% t() %>% round(., digits = 3) %>% as.data.frame() %>% print()
# Find max and round it (ceiling) to the nearest .1 multiple
max.axis <- plyr::round_any(max(comorbidities), 0.1, f = ceiling)
# Prepare data for radar chart
comorbidities <- rbind(max = rep(max.axis, length(comorbidities)),
min = rep(0, length(comorbidities)),
comorbidities)
# Define colors
# colors <- RColorBrewer::brewer.pal(3, "BuPu")
colors <- c("seagreen4", "indianred3")
colors_border <- colors
colors_in <- c(scales::alpha(colors[1], 0.3), scales::alpha(colors[2], 0.6))
radarchart(comorbidities,
axistype = 1,
# custom polygon
pcol = colors_border, # line color
pfcol = colors_in, # fill-in color
plwd = 2, # line width
plty = 1, # line type
# custom the grid
cglcol = "grey20", # net color
cglty = 3, # net line type
axislabcol = "grey50", # net label color
cglwd = 0.8, # net width
caxislabels = scales::percent(seq(0, max.axis, length.out = 5)),
# custom labels
calcex = 0.8,
vlcex = 0.7,
title = "Frequency of Comorbidities in Intensive Care(%)")
legend(x = 1.1,
y = -1,
legend = rownames(comorbidities[-c(1,2),]),
bty = "n",
pch = 20,
col = colors_border,
text.col = "grey40",
cex = 0.9,
pt.cex = 2)
{
cat("Frequencies in percentage (%):\n")
(comorbidities[-c(1:2),] * 100) %>% round(., digits = 1) %>%
t() %>% as.data.frame() %>% print()
}
## Frequencies in percentage (%):
## IC_no IC_yes
## CANC 21.3 9.9
## CEREBROVASCULAR 8.1 2.5
## DIAB 19.6 9.7
## KIDNEY 11.2 4.4
## LIVER 4.1 3.0
## LUNG_I 5.7 2.9
## LUNG_II 3.5 6.2
## HEART 13.9 5.4
## TRANSP 0.9 0.7
## OBESITY 10.4 9.3
## SMOKING 9.9 7.3
## NERVOUS 20.5 6.7
## ASTHMA 2.9 1.3
## HYPERTENSION 2.9 1.3
## PreconditionOther 8.9 3.8
comorbidities <- BD_chusj[,c(7:18, 20:21, 23)]
comorbidities <- cbind(Outcome = BD_chusj$Outcome,
comorbidities)
comorbidities <- rbind(
# total = comorbidities %>% sapply(., sum) %>% t() %>% as.data.frame(),
Recovered = comorbidities %>% filter(Outcome == 0) %>% sapply(., sum) %>% t() %>% as.data.frame(),
Died = comorbidities %>% filter(Outcome == 1) %>% sapply(., sum) %>% t() %>% as.data.frame(),
OtherHospital = comorbidities %>% filter(Outcome == 2) %>% sapply(., sum) %>% t() %>% as.data.frame())
comorbidities$Outcome <- NULL
# convert to percentage
comorbidities <- comorbidities / nrow(BD_chusj)
# Find max and round it (ceiling) to the nearest .1 multiple
max.axis <- plyr::round_any(max(comorbidities), 0.1, f = ceiling)
# Prepare data for radar chart
comorbidities <- rbind(max = rep(max.axis, length(comorbidities)),
min = rep(0, length(comorbidities)),
comorbidities)
# Define colors
# colors <- RColorBrewer::brewer.pal(3, "BuPu")
colors <- c("forestgreen", "firebrick3", "dodgerblue3")
colors_border <- colors
colors_in <- c(scales::alpha(colors[1], 0.15), scales::alpha(colors[2], 0.15))
radarchart(comorbidities,
axistype = 1,
# custom polygon
pcol = colors_border, # line color
pfcol = colors_in, # fill-in color
plwd = 2, # line width
plty = 1, # line type
# custom the grid
cglcol = "grey20", # net color
cglty = 3, # net line type
axislabcol = "grey50", # net label color
cglwd = 0.8, # net width
caxislabels = scales::percent(seq(0, max.axis, length.out = 5)),
# custom labels
calcex = 0.8,
vlcex = 0.7,
title = "Frequency of Comorbidities per Outcome (%)")
legend(x = 1.1,
y = -0.8,
legend = rownames(comorbidities[-c(1,2),]),
bty = "n",
pch = 20,
col = colors_border,
text.col = "grey40",
cex = 0.9,
pt.cex = 2)
{
cat("Frequencies in percentage (%):\n")
(comorbidities[-c(1:2),] * 100) %>% round(., digits = 1) %>%
t() %>% as.data.frame() %>% print()
}
## Frequencies in percentage (%):
## Recovered Died OtherHospital
## CANC 19.6 8.5 3.0
## CEREBROVASCULAR 6.0 3.2 1.4
## DIAB 19.6 7.0 2.6
## KIDNEY 9.2 4.8 1.6
## LIVER 4.8 1.7 0.6
## LUNG_I 5.0 2.6 1.0
## LUNG_II 4.8 3.0 2.0
## HEART 11.0 6.0 2.2
## TRANSP 1.3 0.3 0.0
## OBESITY 12.9 3.6 3.1
## SMOKING 10.6 4.2 2.3
## NERVOUS 15.8 8.1 3.2
## ASTHMA 3.4 0.4 0.4
## HYPERTENSION 3.4 0.4 0.4
## PreconditionOther 8.8 2.2 1.5
#' ATC codes (first level) are listed in variables "469 + 378-390"
ATC_lv1 <- BD_chusj[, c(469, 378:390)]
# Create an empty dataset
heatmap.matrix <- data.frame(matrix(NA, ncol = length(ATC_lv1), nrow = length(ATC_lv1)))
colnames(heatmap.matrix) <- rownames(heatmap.matrix) <- names(ATC_lv1)
# Fill in dataset with count of co-occurences
for (i in 1:length(ATC_lv1)) {
for (j in i:length(ATC_lv1)) {
heatmap.matrix[i,j] <- heatmap.matrix[j,i] <- nrow(ATC_lv1[,c(i,j)][ATC_lv1[,i]==1 & ATC_lv1[,j]==1,])
}
}; rm(i, j)
# Reshape data matrix to unrolled data, to feed heatmap
heatmap.data <- reshape2::melt(as.matrix(heatmap.matrix))
colnames(heatmap.data) <- c("x", "y", "value")
# Change co-occurance counts to to percentages
heatmap.data$value <- round(heatmap.data$value / nrow(ATC_lv1), digits = 3)
# Plot graphic
ggplot(heatmap.data, aes(x, y, fill = value)) +
geom_tile(color = "grey95",
lwd = 1,
linetype = 1) +
theme(axis.text.x = element_text(angle = 90),
axis.title.x = element_blank(),
axis.title.y = element_blank()) +
ggtitle(label = "Heatmap",
subtitle = "Co-frequencies in 1st-level ATC drug groups") +
geom_text(aes(label = value), color = "gray30", size = 3) +
scale_fill_gradient(low = "lightsteelblue1", high = "darkolivegreen3") +
guides(fill = guide_colourbar(barwidth = 0.5,
barheight = 20,
title = "%"))
ATC.B is the single most prevalent ATC drug family, being used by 89.1% of patients.
ATC.A + ATC.B is the most prevalent drug family dual association, being jointly used by 66.6% of patients.
groups <- colnames(heatmap.matrix)
group.combinations <- combn(groups, 2) %>% as.data.frame()
cofrequencies <-
sapply(1:ncol(group.combinations),
FUN = function(n){
heatmap.data %>%
filter(x == group.combinations[1,n],
y == group.combinations[2,n]) %>%
select(value)
})
group.combinations <-
cbind(t(group.combinations), cofrequencies) %>%
as.data.frame() %>%
lapply(., as.character) %>%
as.data.frame()
group.combinations$cofrequencies <- as.numeric(group.combinations$cofrequencies)
group.combinations %>% arrange(desc(cofrequencies)) %>% head(10) %>% print
## V1 V2 cofrequencies
## 1 ATC.A ATC.B 0.666
## 2 ATC.B ATC.N 0.556
## 3 ATC.B ATC.J 0.524
## 4 ATC.A ATC.N 0.507
## 5 ATC.B ATC.R 0.495
## 6 ATC.A ATC.J 0.464
## 7 ATC.J ATC.N 0.440
## 8 ATC.B ATC.C 0.400
## 9 ATC.A ATC.R 0.396
## 10 ATC.A ATC.C 0.360
The most frequent dual combinations are the following, all of which occurring in more than 50% of patients:
ATC.A + ATC.B : 66,6 %
ATC.B + ATC.N : 55,6 %
ATC.B + ATC.J : 52,4 %
ATC.A + ATC.N : 50,7 %
heatmap_2groups <- function(index1 = NA,
index2 = NA) {
#' This function plots a heatmap of 2 ATC groups at 2nd level
#' inputs: the indexes on the 2nd levels codes for each group
#' ATC.A: 362:377 (16 codes)
#' ATC.B: 391:395 (5 codes)
#' ATC.J: 425:430 (6 codes)
#' ATC.N: 441:447 (7 codes)
ATC_group1 <- BD_chusj[, index1]
ATC_group2 <- BD_chusj[, index2]
# Create an empty dataset
heatmap.matrix <- data.frame(matrix(NA,
ncol = ncol(ATC_group1),
nrow = ncol(ATC_group2)))
colnames(heatmap.matrix) <- names(ATC_group1)
rownames(heatmap.matrix) <- names(ATC_group2)
ATC_groups12 <- cbind(ATC_group1, ATC_group2)
for (g1 in seq_along(ATC_group1)) {
for (g2 in seq_along(ATC_group2)) {
heatmap.matrix[g2, g1] <-
nrow(ATC_groups12[, c(g1, g2 + ncol(ATC_group1))][ATC_groups12[,g1]==1 &
ATC_groups12[,g2+ncol(ATC_group1)]==1,])
}
}
heatmap.data <- reshape2::melt(as.matrix(heatmap.matrix))
colnames(heatmap.data) <- c("x", "y", "value")
# change values to percentage
# heatmap.data$value <- round(heatmap.data$value / nrow(ATC_lv1), digits = 3)
family.group1 <- substring(names(ATC_group1)[1], 5, 5)
family.group2 <- substring(names(ATC_group2)[1], 5, 5)
p <-
ggplot(heatmap.data, aes(x, y, fill = value)) +
geom_tile(color = "grey95",
lwd = 1,
linetype = 1) +
theme(axis.text.x = element_text(angle = 0),
axis.title.x = element_blank(),
axis.title.y = element_blank()) +
ggtitle(label = "Heatmap",
subtitle = paste("Co-frequencies in 2nd-level ATC drug groups",
family.group1, "and", family.group2)) +
geom_text(aes(label = value), color = "gray30", size = 3) +
scale_fill_gradient(low = "lightsteelblue1", high = "darkolivegreen3") +
# coord_fixed() +
guides(fill = guide_colourbar(barwidth = 0.5,
barheight = 20,
title = "freq"))
# Find most prevalent combinations
heatmap.data %>%
arrange(desc(value)) %>%
mutate(perc = round(value / nrow(BD_chusj), 3)) %>%
relocate(y) %>%
rename(!!family.group1 := y, !!family.group2 := x, freq = value) %>%
head() %>% print()
return(p)
}
heatmap_2groups(index1 = 362:377, index2 = 391:395) # A.B
## A B freq perc
## 1 ATC.A02 ATC.B01 894 0.333
## 2 ATC.A02 ATC.B05 850 0.316
## 3 ATC.A03 ATC.B01 832 0.310
## 4 ATC.A10 ATC.B01 811 0.302
## 5 ATC.A03 ATC.B05 770 0.286
## 6 ATC.A10 ATC.B05 685 0.255
heatmap_2groups(index1 = 391:395, index2 = 441:447) # B.N
## B N freq perc
## 1 ATC.B01 ATC.N02 1233 0.459
## 2 ATC.B05 ATC.N02 1116 0.415
## 3 ATC.B01 ATC.N05 664 0.247
## 4 ATC.B05 ATC.N05 642 0.239
## 5 ATC.B01 ATC.N01 480 0.179
## 6 ATC.B05 ATC.N01 447 0.166
heatmap_2groups(index1 = 391:395, index2 = 425:430) # B.J
## B J freq perc
## 1 ATC.B01 ATC.J01 1293 0.481
## 2 ATC.B05 ATC.J01 1110 0.413
## 3 ATC.B03 ATC.J01 203 0.076
## 4 ATC.B02 ATC.J01 93 0.035
## 5 ATC.B01 ATC.J05 70 0.026
## 6 ATC.B05 ATC.J02 68 0.025
heatmap_2groups(index1 = 362:377, index2 = 441:447) # A.N
## A N freq perc
## 1 ATC.A03 ATC.N02 830 0.309
## 2 ATC.A02 ATC.N02 810 0.301
## 3 ATC.A10 ATC.N02 583 0.217
## 4 ATC.A02 ATC.N05 494 0.184
## 5 ATC.A03 ATC.N05 484 0.180
## 6 ATC.A02 ATC.N01 423 0.157